import os, gzip, re, time, bsddb, struct
from util import *
from BeautifulSoup import *

# Database file names; neither is referenced by the code visible in this file.
__DB__ = 'db.url'
__WEBGRAPH__ = 'db.webgraph'

# Input directory, resolved against the parent of the current working
# directory. The one-element tuple plus the trailing [0] picks the dataset
# sub-path that is in use.
__INPUT__ = os.path.join(
    os.path.dirname(os.getcwd()),
    ('WebBase/1.fwl/',)[0]
)
def main():
    """Print the distinct URL lines found in the input files, then their count.

    Reads at most the first 1000 entries of the ``__INPUT__`` directory, in
    ``os.listdir`` order. Every line that contains ``'http://'`` but not
    ``'image'`` is stripped of surrounding whitespace and collected. The set
    of distinct lines is printed, followed by its size.
    """
    unique_urls = set()
    # List file in DataTest (hostname == filename); only the first 1000
    # directory entries are scanned.
    for hostname in os.listdir(__INPUT__)[:1000]:
        # os.path.join works whether or not __INPUT__ ends with a separator,
        # and the with-block closes each file as soon as it has been read.
        with open(os.path.join(__INPUT__, hostname), 'r') as site:
            for line in site:
                if 'http://' in line and 'image' not in line:
                    unique_urls.add(line.strip())
    # A single parenthesised argument prints the same text whether `print`
    # is a statement or a function.
    print(unique_urls)
    print(len(unique_urls))

if __name__ == "__main__" :
    start_t = time.time()
    main()
    print 'Overall time usage:',time.time()-start_t,'sec'
